0.1 Intalling Required Packages

packages = c('rgdal', 'spdep', 'tmap', 'sf', 'ggpubr', 'cluster', 'factoextra', 'NbClust', 'heatmaply', 'corrplot', 'psych', 'tidyverse')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}
## Warning: package 'factoextra' was built under R version 4.0.3
## Warning: package 'NbClust' was built under R version 4.0.3

0.2 Data Import and Preparation

0.2.1 Importing Aspatial and Spatial Data

corp_info_merged <- read_csv("data/aspatial/corp_info_merged.csv")
## Parsed with column specification:
## cols(
##   uen = col_character(),
##   registration_year = col_double(),
##   primary_ssic_code = col_character(),
##   postal_code = col_double(),
##   category = col_character(),
##   primary_ssic_category_description = col_character(),
##   X_coord = col_double(),
##   Y_coord = col_double()
## )
corp_info_merged <- st_as_sf(corp_info_merged, coords = c('X_coord','Y_coord'), crs = 3414)
ssic2020 <- read_csv("data/aspatial/ssic2020.csv")
## Parsed with column specification:
## cols(
##   `SSIC 2020` = col_character(),
##   category = col_character(),
##   primary_ssic_code = col_character()
## )
postal_code_geom <- read_csv("data/aspatial/postal_code_geom.csv")
## Parsed with column specification:
## cols(
##   postal_code = col_double(),
##   X_coord = col_double(),
##   Y_coord = col_double()
## )
mpsz <- st_read(dsn = "data/geospatial", layer="MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `D:\GitHub\is415-proj\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## projected CRS:  SVY21
planning_area <- mpsz %>%
  group_by(PLN_AREA_N) %>%
  summarise(geometry = st_union(geometry)) %>%
  ungroup() %>%
  filter(!(PLN_AREA_N %in% c("NORTH-EASTERN ISLANDS",
                             "CENTRAL WATER CATCHMENT",
                             "CHANGI BAY",
                             "MARINA SOUTH",
                             "SIMPANG",
                             "SOUTHERN ISLANDS",
                             "STRAITS VIEW",
                             "TENGAH")))
## `summarise()` ungrouping output (override with `.groups` argument)

0.2.2 Checking the CRS for Spatial Data

planning_area
## Simple feature collection with 47 features and 1 field
## geometry type:  GEOMETRY
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 50293.96 ymax: 50256.33
## projected CRS:  SVY21
## # A tibble: 47 x 2
##    PLN_AREA_N                                                           geometry
##  * <chr>                                                           <POLYGON [m]>
##  1 ANG MO KIO    ((31070.93 39048.68, 31072.47 39019.76, 31072.2 38992.06, 3106~
##  2 BEDOK         ((39348.48 32017.65, 39346.79 32013.97, 39316.22 31997.18, 393~
##  3 BISHAN        ((30988.88 36251.95, 31016.58 36176.08, 31017.98 36172.26, 309~
##  4 BOON LAY      ((12643.21 32171.07, 12643.06 32171.43, 12614.5 32239.82, 1261~
##  5 BUKIT BATOK   ((20198.51 36532.01, 20205.86 36519.86, 20265.9 36431.76, 2030~
##  6 BUKIT MERAH   ((28983.37 28341.95, 29112.53 28345.02, 29182.45 28346.68, 291~
##  7 BUKIT PANJANG ((23235.16 37023.92, 23363.43 36821, 23324.07 36810.26, 23223.~
##  8 BUKIT TIMAH   ((23527.63 32853.63, 23397.96 32719.03, 23326.3 32643.58, 2331~
##  9 CHANGI        ((45818.88 41497.37, 45838.76 41496.62, 45875.88 41498.13, 458~
## 10 CHOA CHU KANG ((18700.53 39143.88, 18559.17 39008.54, 18298.41 39242.47, 182~
## # ... with 37 more rows
st_crs(planning_area)
## Coordinate Reference System:
##   User input: SVY21 
##   wkt:
## PROJCRS["SVY21",
##     BASEGEOGCRS["SVY21[WGS84]",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]],
##             ID["EPSG",6326]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["Degree",0.0174532925199433]]],
##     CONVERSION["unnamed",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["(E)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]],
##         AXIS["(N)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]]]

0.2.3 Transforming the CRS for Spatial Data

planning_area_3414 <- st_transform(planning_area, 3414)
st_crs(planning_area_3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]

0.2.4 Converting to Spatial or Spatial* Equivalents

corp_info_merged_sp <- as(corp_info_merged, "Spatial")
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition,
##  but +towgs84= values preserved
corp_info_merged_sp <- as(corp_info_merged_sp, "SpatialPoints")
planning_area_sp <- as(planning_area_3414, "Spatial")
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition,
##  but +towgs84= values preserved
plot(planning_area_sp, border="darkgrey")+
plot(corp_info_merged_sp, add=TRUE)

## integer(0)
unique(corp_info_merged$category)
##  [1] "G" "F" "H" "C" "N" "I" "S" "M" "Q" "L" "J" "R" "P" "E" "K" "D" "A" "O"
for (category_id in unique(corp_info_merged$category)) {
  corp_with_category <- corp_info_merged %>%
    filter(category == category_id)
  planning_area_3414[, paste0("Category ", category_id)]<- lengths(st_intersects(planning_area_3414, corp_with_category))
}
planning_area_3414 <- planning_area_3414 %>%
  mutate(Total = rowSums(across("Category G":"Category O")))
print(mean(planning_area_3414$Total))
## [1] 2396.213
planning_area_3414 <- planning_area_3414 %>%
  mutate(`Cat G Prop` = case_when(Total != 0 ~ `Category G`/Total * 1000,
                                             Total == 0 ~ 0)) %>%
  mutate(`Cat F Prop` = case_when(Total != 0 ~ `Category F`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat H Prop` = case_when(Total != 0 ~ `Category H`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat C Prop` = case_when(Total != 0 ~ `Category C`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat N Prop` = case_when(Total != 0 ~ `Category N`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat I Prop` = case_when(Total != 0 ~ `Category I`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat S Prop` = case_when(Total != 0 ~ `Category S`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat M Prop` = case_when(Total != 0 ~ `Category M`/Total * 1000,
                                       Total == 0 ~ 0)) %>%
  mutate(`Cat Q Prop` = case_when(Total != 0 ~ `Category Q`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat L Prop` = case_when(Total != 0 ~ `Category L`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat J Prop` = case_when(Total != 0 ~ `Category J`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat R Prop` = case_when(Total != 0 ~ `Category R`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat P Prop` = case_when(Total != 0 ~ `Category P`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat E Prop` = case_when(Total != 0 ~ `Category E`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat K Prop` = case_when(Total != 0 ~ `Category K`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat D Prop` = case_when(Total != 0 ~ `Category D`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat A Prop` = case_when(Total != 0 ~ `Category A`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat O Prop` = case_when(Total != 0 ~ `Category O`/Total * 1000,
                                         Total == 0 ~ 0))

0.3 Exploratory Data Analysis

0.3.1 Visualisation of Statistics

ggplot(data=planning_area_3414, aes(x=`Category M`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggplot(data=planning_area_3414, aes(x=`Category M`)) +
  geom_boxplot(color="black", fill="light blue")

ggplot(data=planning_area_3414, aes(x=`Cat M Prop`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggplot(data=planning_area_3414, aes(x=`Cat M Prop`)) +
  geom_boxplot(color="black", fill="light blue")

cat_g <- ggplot(data=planning_area_3414, 
             aes(x= `Cat G Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_f <- ggplot(data=planning_area_3414, 
             aes(x= `Cat F Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_h <- ggplot(data=planning_area_3414, 
             aes(x= `Cat H Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_c <- ggplot(data=planning_area_3414, 
             aes(x= `Cat C Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_n <- ggplot(data=planning_area_3414, 
             aes(x= `Cat N Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_i <- ggplot(data=planning_area_3414, 
             aes(x= `Cat I Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_g, cat_f, cat_h, cat_c, cat_n, cat_i, 
          ncol = 3, 
          nrow = 2)

cat_s <- ggplot(data=planning_area_3414, 
             aes(x= `Cat S Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_m <- ggplot(data=planning_area_3414, 
             aes(x= `Cat M Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_q <- ggplot(data=planning_area_3414, 
             aes(x= `Cat Q Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_l <- ggplot(data=planning_area_3414, 
             aes(x= `Cat L Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_j <- ggplot(data=planning_area_3414, 
             aes(x= `Cat J Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_r <- ggplot(data=planning_area_3414, 
             aes(x= `Cat R Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_s, cat_m, cat_q, cat_l, cat_j, cat_r, 
          ncol = 3, 
          nrow = 2)

cat_p <- ggplot(data=planning_area_3414, 
             aes(x= `Cat P Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_e <- ggplot(data=planning_area_3414, 
             aes(x= `Cat E Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_k <- ggplot(data=planning_area_3414, 
             aes(x= `Cat K Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_d <- ggplot(data=planning_area_3414, 
             aes(x= `Cat D Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_a <- ggplot(data=planning_area_3414, 
             aes(x= `Cat A Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_o <- ggplot(data=planning_area_3414, 
             aes(x= `Cat O Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_p, cat_e, cat_k, cat_d, cat_a, cat_o, 
          ncol = 3, 
          nrow = 2)

0.3.2 Choropleths

qtm(planning_area_3414, "Cat H Prop")

qtm(planning_area_3414, "Total")

0.3.3 Correlation Analysis

planning_area_3414_derived <- st_drop_geometry(planning_area_3414)
cluster_vars.cor = cor(planning_area_3414_derived[, 21:38])
corrplot.mixed(cluster_vars.cor,
         lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black")

0.4 Extracting Clustering Variables

cluster_vars <- planning_area_3414_derived %>%
  select("PLN_AREA_N", ends_with("Prop"))
head(cluster_vars,10)
## # A tibble: 10 x 19
##    PLN_AREA_N `Cat G Prop` `Cat F Prop` `Cat H Prop` `Cat C Prop` `Cat N Prop`
##    <chr>             <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 ANG MO KIO         285.         70.9         69.8         35.1         49.4
##  2 BEDOK              266.         56.3         55.5         42.2         49.3
##  3 BISHAN             271.         47.8         55.4         30.2         53.9
##  4 BOON LAY           271.        153.         104.         164.          41.1
##  5 BUKIT BAT~         281.         70.3         81.7         43.0         59.9
##  6 BUKIT MER~         236.         35.2         54.7         28.2         47.5
##  7 BUKIT PAN~         271.         41.8        116.          37.0         63.7
##  8 BUKIT TIM~         208.         22.1         32.2         22.5         47.9
##  9 CHANGI             256.         56.8        227.          45.5         39.8
## 10 CHOA CHU ~         266.         57.3        133.          34.1         65.5
## # ... with 13 more variables: `Cat I Prop` <dbl>, `Cat S Prop` <dbl>, `Cat M
## #   Prop` <dbl>, `Cat Q Prop` <dbl>, `Cat L Prop` <dbl>, `Cat J Prop` <dbl>,
## #   `Cat R Prop` <dbl>, `Cat P Prop` <dbl>, `Cat E Prop` <dbl>, `Cat K
## #   Prop` <dbl>, `Cat D Prop` <dbl>, `Cat A Prop` <dbl>, `Cat O Prop` <dbl>
row.names(cluster_vars) <- cluster_vars$`PLN_AREA_N`
## Warning: Setting row names on a tibble is deprecated.
head(cluster_vars,10)
## # A tibble: 10 x 19
##    PLN_AREA_N `Cat G Prop` `Cat F Prop` `Cat H Prop` `Cat C Prop` `Cat N Prop`
##    <chr>             <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 ANG MO KIO         285.         70.9         69.8         35.1         49.4
##  2 BEDOK              266.         56.3         55.5         42.2         49.3
##  3 BISHAN             271.         47.8         55.4         30.2         53.9
##  4 BOON LAY           271.        153.         104.         164.          41.1
##  5 BUKIT BAT~         281.         70.3         81.7         43.0         59.9
##  6 BUKIT MER~         236.         35.2         54.7         28.2         47.5
##  7 BUKIT PAN~         271.         41.8        116.          37.0         63.7
##  8 BUKIT TIM~         208.         22.1         32.2         22.5         47.9
##  9 CHANGI             256.         56.8        227.          45.5         39.8
## 10 CHOA CHU ~         266.         57.3        133.          34.1         65.5
## # ... with 13 more variables: `Cat I Prop` <dbl>, `Cat S Prop` <dbl>, `Cat M
## #   Prop` <dbl>, `Cat Q Prop` <dbl>, `Cat L Prop` <dbl>, `Cat J Prop` <dbl>,
## #   `Cat R Prop` <dbl>, `Cat P Prop` <dbl>, `Cat E Prop` <dbl>, `Cat K
## #   Prop` <dbl>, `Cat D Prop` <dbl>, `Cat A Prop` <dbl>, `Cat O Prop` <dbl>
sg_business <- select(cluster_vars, c(2:19))
row.names(sg_business) <- cluster_vars$`PLN_AREA_N`
## Warning: Setting row names on a tibble is deprecated.
head(sg_business, 10)
## # A tibble: 10 x 18
##    `Cat G Prop` `Cat F Prop` `Cat H Prop` `Cat C Prop` `Cat N Prop` `Cat I Prop`
##           <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1         285.         70.9         69.8         35.1         49.4         76.6
##  2         266.         56.3         55.5         42.2         49.3         55.0
##  3         271.         47.8         55.4         30.2         53.9         48.6
##  4         271.        153.         104.         164.          41.1         35.6
##  5         281.         70.3         81.7         43.0         59.9         48.8
##  6         236.         35.2         54.7         28.2         47.5         81.8
##  7         271.         41.8        116.          37.0         63.7         50.7
##  8         208.         22.1         32.2         22.5         47.9         62.8
##  9         256.         56.8        227.          45.5         39.8        159. 
## 10         266.         57.3        133.          34.1         65.5         71.2
## # ... with 12 more variables: `Cat S Prop` <dbl>, `Cat M Prop` <dbl>, `Cat Q
## #   Prop` <dbl>, `Cat L Prop` <dbl>, `Cat J Prop` <dbl>, `Cat R Prop` <dbl>,
## #   `Cat P Prop` <dbl>, `Cat E Prop` <dbl>, `Cat K Prop` <dbl>, `Cat D
## #   Prop` <dbl>, `Cat A Prop` <dbl>, `Cat O Prop` <dbl>

0.5 Data Standardisation

0.5.1 Min-max Standardisation

sg_business.minmax <- normalize(sg_business)
summary(sg_business.minmax)
##    Cat G Prop       Cat F Prop       Cat H Prop       Cat C Prop    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5520   1st Qu.:0.1203   1st Qu.:0.1350   1st Qu.:0.1261  
##  Median :0.6537   Median :0.2728   Median :0.2493   Median :0.1690  
##  Mean   :0.5962   Mean   :0.2813   Mean   :0.2853   Mean   :0.2520  
##  3rd Qu.:0.6891   3rd Qu.:0.3721   3rd Qu.:0.4236   3rd Qu.:0.2431  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Cat N Prop       Cat I Prop        Cat S Prop       Cat M Prop    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2725   1st Qu.:0.04737   1st Qu.:0.2520   1st Qu.:0.4437  
##  Median :0.3459   Median :0.06124   Median :0.4363   Median :0.5501  
##  Mean   :0.3451   Mean   :0.08322   Mean   :0.4040   Mean   :0.5322  
##  3rd Qu.:0.3883   3rd Qu.:0.07372   3rd Qu.:0.5241   3rd Qu.:0.6377  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    Cat Q Prop       Cat L Prop       Cat J Prop       Cat R Prop    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1290   1st Qu.:0.1395   1st Qu.:0.2347   1st Qu.:0.2210  
##  Median :0.1930   Median :0.3167   Median :0.3065   Median :0.3097  
##  Mean   :0.2351   Mean   :0.2990   Mean   :0.3341   Mean   :0.3279  
##  3rd Qu.:0.2755   3rd Qu.:0.3795   3rd Qu.:0.4391   3rd Qu.:0.4036  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Cat P Prop       Cat E Prop        Cat K Prop        Cat D Prop     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.3090   1st Qu.:0.01327   1st Qu.:0.08158   1st Qu.:0.00000  
##  Median :0.5315   Median :0.02253   Median :0.16233   Median :0.08987  
##  Mean   :0.4641   Mean   :0.07622   Mean   :0.20481   Mean   :0.14194  
##  3rd Qu.:0.6175   3rd Qu.:0.05097   3rd Qu.:0.24424   3rd Qu.:0.19562  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    Cat A Prop         Cat O Prop     
##  Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.005422   1st Qu.:0.00000  
##  Median :0.011862   Median :0.00000  
##  Mean   :0.041839   Mean   :0.08626  
##  3rd Qu.:0.019626   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.00000

0.5.2 Z-score Standardisation

sg_business.zscore <- scale(sg_business)
summary(sg_business.zscore)
##    Cat G Prop        Cat F Prop         Cat H Prop        Cat C Prop      
##  Min.   :-3.2891   Min.   :-1.26781   Min.   :-1.4063   Min.   :-1.08528  
##  1st Qu.:-0.2443   1st Qu.:-0.72563   1st Qu.:-0.7407   1st Qu.:-0.54217  
##  Median : 0.3171   Median :-0.03849   Median :-0.1776   Median :-0.35773  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.5123   3rd Qu.: 0.40894   3rd Qu.: 0.6816   3rd Qu.:-0.03851  
##  Max.   : 2.2273   Max.   : 3.23879   Max.   : 3.5230   Max.   : 3.22094  
##    Cat N Prop          Cat I Prop         Cat S Prop        Cat M Prop      
##  Min.   :-2.401438   Min.   :-0.59324   Min.   :-1.9315   Min.   :-2.51272  
##  1st Qu.:-0.504996   1st Qu.:-0.25554   1st Qu.:-0.7267   1st Qu.:-0.41756  
##  Median : 0.005169   Median :-0.15669   Median : 0.1544   Median : 0.08464  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.300250   3rd Qu.:-0.06767   3rd Qu.: 0.5740   3rd Qu.: 0.49830  
##  Max.   : 4.556796   Max.   : 6.53578   Max.   : 2.8490   Max.   : 2.20878  
##    Cat Q Prop        Cat L Prop         Cat J Prop        Cat R Prop      
##  Min.   :-1.1376   Min.   :-1.35192   Min.   :-1.6015   Min.   :-1.64930  
##  1st Qu.:-0.5132   1st Qu.:-0.72102   1st Qu.:-0.4765   1st Qu.:-0.53792  
##  Median :-0.2037   Median : 0.08004   Median :-0.1321   Median :-0.09156  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.1954   3rd Qu.: 0.36412   3rd Qu.: 0.5034   3rd Qu.: 0.38078  
##  Max.   : 3.7003   Max.   : 3.17018   Max.   : 3.1919   Max.   : 3.38011  
##    Cat P Prop        Cat E Prop        Cat K Prop        Cat D Prop     
##  Min.   :-1.8734   Min.   :-0.4334   Min.   :-1.0844   Min.   :-0.7178  
##  1st Qu.:-0.6262   1st Qu.:-0.3579   1st Qu.:-0.6524   1st Qu.:-0.7178  
##  Median : 0.2719   Median :-0.3052   Median :-0.2249   Median :-0.2633  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6192   3rd Qu.:-0.1436   3rd Qu.: 0.2088   3rd Qu.: 0.2714  
##  Max.   : 2.1631   Max.   : 5.2525   Max.   : 4.2101   Max.   : 4.3392  
##    Cat A Prop        Cat O Prop     
##  Min.   :-0.2732   Min.   :-0.3554  
##  1st Qu.:-0.2378   1st Qu.:-0.3554  
##  Median :-0.1957   Median :-0.3554  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.1450   3rd Qu.:-0.3554  
##  Max.   : 6.2558   Max.   : 3.7644

0.5.3 Visualising the standardised clustering variables

r <- ggplot(data=planning_area_3414_derived, 
             aes(x= `Cat M Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

sg_business_minmax_df <- as.data.frame(sg_business.minmax)
s <- ggplot(data=sg_business_minmax_df, 
       aes(x=`Cat M Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

sg_business_zscore_df <- as.data.frame(sg_business.zscore)
z <- ggplot(data=sg_business_zscore_df, 
       aes(x=`Cat M Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Z-score Standardisation")

ggarrange(r, s, z,
          ncol = 3,
          nrow = 1)

0.6 Hierarchical Cluster Analysis

0.6.1 Computing Proximity Matrix

proxmat <- dist(sg_business, method = 'euclidean')

0.6.2 Computing Hierarchical Clustering

hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)

0.6.3 Selecting Optimal Clustering Algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(sg_business, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.8845678 0.8922229 0.8896038 0.9075350

With reference to the output above, we can see that Ward’s method provides the strongest clustering structure among the four methods assessed. Hence, in the subsequent analysis, only Ward’s method will be used.

0.6.4 Determining Optimal Clusters

set.seed(54321)
gap_stat <- clusGap(sg_business, FUN = hcut, nstart = 25, K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sg_business, FUNcluster = hcut, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 1
##           logW   E.logW       gap     SE.sim
##  [1,] 7.904633 8.665080 0.7604468 0.04103340
##  [2,] 7.725283 8.360427 0.6351444 0.03845202
##  [3,] 7.507726 8.229915 0.7221892 0.03267079
##  [4,] 7.415381 8.129552 0.7141711 0.02971707
##  [5,] 7.266590 8.046361 0.7797706 0.03046773
##  [6,] 7.192193 7.974757 0.7825644 0.03056135
##  [7,] 7.108255 7.914675 0.8064201 0.03002151
##  [8,] 7.015753 7.858996 0.8432428 0.03013872
##  [9,] 6.941829 7.805217 0.8633878 0.02932208
## [10,] 6.871708 7.753003 0.8812947 0.02814277
fviz_gap_stat(gap_stat)

0.6.5 Interpreting the Dendrogram

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 10, border = 2:5)

0.6.6 Visually Driven Hierarchical Analysis

sg_business_mat <- data.matrix(sg_business)
heatmaply(normalize(sg_business_mat),
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 10,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of Singapore by Business Prominence",
          xlab = "Business Prominence",
          ylab = "Singapore's Planning Areas"
          )

0.6.7 Mapping the Clusters Formed

groups <- as.factor(cutree(hclust_ward, k=10))
sg_biz_cluster <- cbind(planning_area_3414, as.matrix(groups)) %>%
  rename(`CLUSTER`=`as.matrix.groups.`)
qtm(sg_biz_cluster, "CLUSTER")

0.7 Spatially Constrained Clustering (SKATER)

0.7.1 Conversion to SpatialPolygonsDataFrame

planning_area_3414_sp <- as_Spatial(planning_area_3414)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition,
##  but +towgs84= values preserved

0.7.2 Computing Neighbour List

sg.nb <- poly2nb(planning_area_3414_sp)
summary(sg.nb)
## Neighbour list object:
## Number of regions: 47 
## Number of nonzero links: 212 
## Percentage nonzero weights: 9.597103 
## Average number of links: 4.510638 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8 
##  1  3  8 13 10  8  1  3 
## 1 least connected region:
## 44 with 1 link
## 3 most connected regions:
## 17 36 41 with 8 links
plot(planning_area_3414_sp, border=grey(.5))
plot(sg.nb, coordinates(planning_area_3414_sp), col="blue", add=TRUE)

0.7.3 Computing Minimum Spanning Tree

0.7.3.1 Calculate Edge Cost

lcosts <- nbcosts(sg.nb, sg_business)
sg.w <- nb2listw(sg.nb, lcosts, style="B")
summary(sg.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 47 
## Number of nonzero links: 212 
## Percentage nonzero weights: 9.597103 
## Average number of links: 4.510638 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8 
##  1  3  8 13 10  8  1  3 
## 1 least connected region:
## 44 with 1 link
## 3 most connected regions:
## 17 36 41 with 8 links
## 
## Weights style: B 
## Weights constants summary:
##    n   nn      S0       S1        S2
## B 47 2209 36488.8 24823350 168882011

0.7.3.2 Creating Minimum Spanning Tree

sg.mst <- mstree(sg.w)
class(sg.mst)
## [1] "mst"    "matrix"
dim(sg.mst)
## [1] 46  3
head(sg.mst)
##      [,1] [,2]     [,3]
## [1,]   14   37 44.92823
## [2,]   14    2 49.55329
## [3,]   37    1 52.85235
## [4,]    1   47 61.51380
## [5,]   14   36 63.18488
## [6,]   36   30 36.39988
plot(planning_area_sp, border=gray(.5))
plot.mst(sg.mst, coordinates(planning_area_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

skaterclust <- skater(sg.mst[,1:2], sg_business, method = "euclidean", 9)
str(skaterclust)
## List of 8
##  $ groups      : num [1:47] 1 1 1 5 1 6 1 1 1 1 ...
##  $ edges.groups:List of 10
##   ..$ :List of 3
##   .. ..$ node: num [1:28] 47 40 46 1 35 37 17 14 3 42 ...
##   .. ..$ edge: num [1:27, 1:3] 40 46 47 1 35 37 37 17 14 3 ...
##   .. ..$ ssw : num 2227
##   ..$ :List of 3
##   .. ..$ node: num 20
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num [1:6] 41 32 22 23 38 25
##   .. ..$ edge: num [1:5, 1:3] 22 32 23 41 41 38 22 25 32 23 ...
##   .. ..$ ssw : num 574
##   ..$ :List of 3
##   .. ..$ node: num 44
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num [1:3] 4 29 43
##   .. ..$ edge: num [1:2, 1:3] 4 29 29 43 102 ...
##   .. ..$ ssw : num 156
##   ..$ :List of 3
##   .. ..$ node: num [1:4] 6 31 26 12
##   .. ..$ edge: num [1:3, 1:3] 26 6 31 12 26 ...
##   .. ..$ ssw : num 287
##   ..$ :List of 3
##   .. ..$ node: num 18
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 45
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 28
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 34
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##  $ not.prune   : NULL
##  $ candidates  : int [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ ssto        : num 7439
##  $ ssw         : num [1:10] 7439 6366 5546 4970 4501 ...
##  $ crit        : num [1:2] 1 Inf
##  $ vec.crit    : num [1:47] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "class")= chr "skater"
clustergrps <- skaterclust$groups
table(clustergrps)
## clustergrps
##  1  2  3  4  5  6  7  8  9 10 
## 28  1  6  1  3  4  1  1  1  1
plot(planning_area_sp, border=gray(.5))
plot(skaterclust, coordinates(planning_area_3414_sp), cex.lab=.7,
     groups.colors=c("red","green","blue", "brown", "pink"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

groups_mat <- as.matrix(skaterclust$groups)
sg_biz_spatialcluster <- cbind(sg_biz_cluster, as.factor(groups_mat)) %>%
  rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(sg_biz_spatialcluster, "SP_CLUSTER")